In [1]:
# Import necessary libraries
import pandas as pd
In [2]:
# Load the dataset
# NOTE(review): relative path — the CSV must sit in the notebook's working directory.
data = pd.read_csv("aircraft_engine_maintenance_supervised_learning.csv")
In [3]:
data.head()
Out[3]:
Unnamed: 0 Engine_ID Timestamp Temperature Pressure Rotational_Speed Engine_Health Fuel_Consumption Vibration_Level Oil_Temperature Altitude Humidity Maintenance_Needed Remaining_used_life
0 0 1 2023-01-01 0:00:00 529.106110 284.745866 3068.290695 1.764369 49.086087 9.164463 61.971112 1084.991320 55.206168 1 27.3
1 1 1 2023-01-01 0:10:00 499.696207 270.464475 2693.907436 1.265819 47.371939 7.289872 79.359045 1016.557471 64.382801 1 21.7
2 2 1 2023-01-01 0:20:00 480.154652 258.314496 3139.764855 1.311214 40.736703 7.144859 88.969967 902.039503 58.907223 1 22.3
3 3 1 2023-01-01 0:30:00 531.114934 281.212266 3043.335177 1.128121 56.214109 6.547841 73.004457 408.693047 50.655640 0 20.9
4 4 1 2023-01-01 0:40:00 499.197208 283.305874 3134.974852 1.567308 47.607585 8.252557 97.416898 1217.009675 48.393398 1 25.0
In [4]:
data.tail()
Out[4]:
Unnamed: 0 Engine_ID Timestamp Temperature Pressure Rotational_Speed Engine_Health Fuel_Consumption Vibration_Level Oil_Temperature Altitude Humidity Maintenance_Needed Remaining_used_life
715 715 5 2023-01-01 23:10:00 414.677212 224.616493 2739.934722 1.251797 44.823469 6.795081 66.359294 1012.825820 47.324979 0 20.30
716 716 5 2023-01-01 23:20:00 534.222365 284.246641 2963.896803 1.266641 47.426626 7.050664 79.428251 998.980476 32.267225 1 22.00
717 717 5 2023-01-01 23:30:00 488.910939 262.138912 3209.008581 1.783258 44.454666 9.373671 97.021645 1251.538058 40.017720 0 27.60
718 718 5 2023-01-01 23:40:00 NaN 271.861935 2915.592050 1.619702 NaN 8.440549 78.458211 1003.570383 45.091718 1 22.65
719 719 5 2023-01-01 23:50:00 415.567779 222.505995 3174.985740 2.075334 42.598441 10.429919 86.549065 1254.175362 46.596882 1 29.40
In [5]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 720 entries, 0 to 719
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           720 non-null    int64  
 1   Engine_ID            720 non-null    int64  
 2   Timestamp            720 non-null    object 
 3   Temperature          686 non-null    float64
 4   Pressure             681 non-null    float64
 5   Rotational_Speed     720 non-null    float64
 6   Engine_Health        720 non-null    float64
 7   Fuel_Consumption     650 non-null    float64
 8   Vibration_Level      720 non-null    float64
 9   Oil_Temperature      720 non-null    float64
 10  Altitude             720 non-null    float64
 11  Humidity             720 non-null    float64
 12  Maintenance_Needed   720 non-null    int64  
 13  Remaining_used_life  720 non-null    float64
dtypes: float64(10), int64(3), object(1)
memory usage: 78.9+ KB
In [6]:
data.describe()
Out[6]:
Unnamed: 0 Engine_ID Temperature Pressure Rotational_Speed Engine_Health Fuel_Consumption Vibration_Level Oil_Temperature Altitude Humidity Maintenance_Needed Remaining_used_life
count 720.000000 720.000000 686.000000 681.000000 720.000000 720.000000 650.000000 720.000000 720.000000 720.000000 720.000000 720.000000 720.000000
mean 359.500000 3.000000 500.071917 269.592322 2989.594389 1.356543 46.492836 7.419655 80.488982 1001.442687 49.691074 0.644444 22.720556
std 207.990384 1.415197 50.110265 25.519526 204.452023 0.336493 4.485993 1.353611 9.961242 211.774371 10.333495 0.479014 3.448688
min 0.000000 1.000000 351.604344 194.364483 2398.058779 0.384061 32.476898 3.486958 51.502253 372.016586 12.655992 0.000000 13.300000
25% 179.750000 2.000000 467.591610 253.859578 2841.774479 1.131308 43.691781 6.517824 73.958559 852.427213 42.675328 0.000000 20.500000
50% 359.500000 3.000000 499.766139 270.002663 2993.425924 1.348170 46.384867 7.381727 80.355930 1012.950873 49.692443 1.000000 22.650000
75% 539.250000 4.000000 533.696565 286.072363 3134.710998 1.574788 49.386030 8.312738 87.391627 1134.953476 56.663203 1.000000 25.000000
max 719.000000 5.000000 669.467631 357.208273 3559.917040 2.603570 67.240246 12.469578 109.728101 1741.281836 89.415037 1.000000 35.600000
In [7]:
# Check missing values using msno: each bar's height is the non-null count
# per column, so shorter bars reveal columns with missing data.
import missingno as msno

msno.bar(data, color='grey')
Out[7]:
<Axes: >
In [8]:
# Display columns with missing values as a heatmap: bright cells mark the
# exact row/column positions of the NaNs.
import seaborn as sns

sns.heatmap(data.isnull())
Out[8]:
<Axes: >
In [9]:
# Impute missing values (replace with median) on numerical columns.
# The median is robust to the outliers visible in the boxplots below.
# NOTE(review): the list also contains Engine_ID and Maintenance_Needed; they
# have no NaNs (see data.info()), so including them changes nothing, but they
# are not sensor measurements.
numerical_columns = ["Engine_ID", "Temperature", "Pressure", "Rotational_Speed", "Engine_Health",
                    'Fuel_Consumption', 'Vibration_Level', 'Oil_Temperature', 'Altitude',
                     'Humidity', 'Maintenance_Needed']

# Reassign instead of inplace=True (pandas anti-pattern: no speed benefit and
# it hides the mutation from readers); result is identical.
data = data.fillna(data[numerical_columns].median())
In [10]:
numerical_columns
Out[10]:
['Engine_ID',
 'Temperature',
 'Pressure',
 'Rotational_Speed',
 'Engine_Health',
 'Fuel_Consumption',
 'Vibration_Level',
 'Oil_Temperature',
 'Altitude',
 'Humidity',
 'Maintenance_Needed']
In [11]:
data.isnull().sum()
Out[11]:
Unnamed: 0             0
Engine_ID              0
Timestamp              0
Temperature            0
Pressure               0
Rotational_Speed       0
Engine_Health          0
Fuel_Consumption       0
Vibration_Level        0
Oil_Temperature        0
Altitude               0
Humidity               0
Maintenance_Needed     0
Remaining_used_life    0
dtype: int64
In [12]:
# Check missing values using msno after filling with median value

import missingno as msno

msno.bar(data, color='grey')
Out[12]:
<Axes: >

Exploratory Data Analysis¶

In [13]:
# Plot boxplots to visualise outliers in each numerical sensor column.
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

# Univariate analysis (numerical columns): one boxplot per column.
columns_to_check = ['Temperature','Pressure','Rotational_Speed','Engine_Health','Fuel_Consumption',
 'Vibration_Level','Oil_Temperature','Altitude','Humidity']


plt.figure(figsize=(18, 10))
for i, col in enumerate(columns_to_check, 1):

    plt.subplot(2, 5, i)
    sns.boxplot(data[col])
    plt.xlabel(col)
    # BUG FIX: the original labelled the y-axis "Frequency", but a boxplot's
    # y-axis shows the value distribution, not counts.
    plt.ylabel("Value")

plt.tight_layout()
plt.show()

Univariate analysis¶

In [14]:
# Univariate analysis (Target column i.e. Maintenance_Needed) - Countplot
# The target is moderately imbalanced (~64% positive, per the describe() mean above).
plt.figure(figsize=(6,4))
sns.countplot(data=data, x="Maintenance_Needed")
Out[14]:
<Axes: xlabel='Maintenance_Needed', ylabel='count'>

Bivariate analysis¶

In [15]:
# Bivariate analysis - Pairplot of all sensor columns plus the two targets,
# coloured by Maintenance_Needed. NOTE: draws (11 x 11) panels, so it is slow.
sns.pairplot(data[columns_to_check + ["Maintenance_Needed", "Remaining_used_life"]], hue="Maintenance_Needed")
plt.show()

Multivariate analysis¶

In [16]:
# Multivariate analysis - Correlation Heatmap
# Pairwise Pearson correlations over the sensor columns and both targets.
correlation_matrix = data[columns_to_check + ["Maintenance_Needed", "Remaining_used_life"]].corr()
correlation_matrix
Out[16]:
Temperature Pressure Rotational_Speed Engine_Health Fuel_Consumption Vibration_Level Oil_Temperature Altitude Humidity Maintenance_Needed Remaining_used_life
Temperature 1.000000 0.940519 0.046193 0.367859 0.699923 0.365465 0.016888 -0.044562 0.007619 0.150548 0.446575
Pressure 0.940519 1.000000 0.037158 0.363584 0.720393 0.360060 0.004292 -0.064315 -0.005532 0.135145 0.426118
Rotational_Speed 0.046193 0.037158 1.000000 0.154165 0.063515 0.156965 -0.089808 -0.033462 -0.073330 0.176332 0.273728
Engine_Health 0.367859 0.363584 0.154165 1.000000 0.245414 0.997212 -0.015014 0.028417 -0.047519 0.349076 0.971913
Fuel_Consumption 0.699923 0.720393 0.063515 0.245414 1.000000 0.246541 0.039242 -0.116406 -0.008178 0.097970 0.306648
Vibration_Level 0.365465 0.360060 0.156965 0.997212 0.246541 1.000000 -0.012863 0.029495 -0.046418 0.348684 0.973311
Oil_Temperature 0.016888 0.004292 -0.089808 -0.015014 0.039242 -0.012863 1.000000 0.050233 0.036015 0.140488 -0.013443
Altitude -0.044562 -0.064315 -0.033462 0.028417 -0.116406 0.029495 0.050233 1.000000 0.045176 0.038855 0.025921
Humidity 0.007619 -0.005532 -0.073330 -0.047519 -0.008178 -0.046418 0.036015 0.045176 1.000000 -0.099261 -0.051098
Maintenance_Needed 0.150548 0.135145 0.176332 0.349076 0.097970 0.348684 0.140488 0.038855 -0.099261 1.000000 0.361361
Remaining_used_life 0.446575 0.426118 0.273728 0.971913 0.306648 0.973311 -0.013443 0.025921 -0.051098 0.361361 1.000000
In [17]:
# Correlation heatmap showing the level of correlation between the features
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Heatmap")
Out[17]:
Text(0.5, 1.0, 'Correlation Heatmap')

Feature Engineering:¶

Create a Time-Related Feature¶

Binning 'Hour_of_Day'¶

Data Pre-processing¶

In [18]:
# Create a time-related feature (Hour_of_Day) on a copy, keeping `data` intact.
dataCopy = data.copy()

# Parse timestamps; anything unparseable becomes NaT and is dropped.
dataCopy['Timestamp'] = pd.to_datetime(dataCopy['Timestamp'], errors='coerce')
dataCopy = dataCopy.dropna(subset=['Timestamp'])

# Extract the hour component (0-23) as the new feature.
dataCopy['Hour_of_Day'] = dataCopy['Timestamp'].dt.hour

dataCopy.head()
Out[18]:
Unnamed: 0 Engine_ID Timestamp Temperature Pressure Rotational_Speed Engine_Health Fuel_Consumption Vibration_Level Oil_Temperature Altitude Humidity Maintenance_Needed Remaining_used_life Hour_of_Day
0 0 1 2023-01-01 00:00:00 529.106110 284.745866 3068.290695 1.764369 49.086087 9.164463 61.971112 1084.991320 55.206168 1 27.3 0
1 1 1 2023-01-01 00:10:00 499.696207 270.464475 2693.907436 1.265819 47.371939 7.289872 79.359045 1016.557471 64.382801 1 21.7 0
2 2 1 2023-01-01 00:20:00 480.154652 258.314496 3139.764855 1.311214 40.736703 7.144859 88.969967 902.039503 58.907223 1 22.3 0
3 3 1 2023-01-01 00:30:00 531.114934 281.212266 3043.335177 1.128121 56.214109 6.547841 73.004457 408.693047 50.655640 0 20.9 0
4 4 1 2023-01-01 00:40:00 499.197208 283.305874 3134.974852 1.567308 47.607585 8.252557 97.416898 1217.009675 48.393398 1 25.0 0
In [19]:
# Binning 'Hour_of_Day' into four day parts.
# With right=False the intervals are left-closed:
# [0,6) Night, [6,12) Morning, [12,18) Afternoon, [18,24) Evening.
# FIX: renamed the locals — the original called this list `labels`, shadowing
# the `labels` target DataFrame used in later cells; also replaced the
# confusing -1 lower edge (0 is included under right=False, so bins are identical
# for integer hours 0-23).
hour_bins = [0, 6, 12, 18, 24]
day_part_labels = ['Night', 'Morning', 'Afternoon', 'Evening']
dataCopy['Day_Part'] = pd.cut(dataCopy['Hour_of_Day'], bins=hour_bins, labels=day_part_labels, right=False)

# Display the dataset with the new feature
dataCopy.head()
Out[19]:
Unnamed: 0 Engine_ID Timestamp Temperature Pressure Rotational_Speed Engine_Health Fuel_Consumption Vibration_Level Oil_Temperature Altitude Humidity Maintenance_Needed Remaining_used_life Hour_of_Day Day_Part
0 0 1 2023-01-01 00:00:00 529.106110 284.745866 3068.290695 1.764369 49.086087 9.164463 61.971112 1084.991320 55.206168 1 27.3 0 Night
1 1 1 2023-01-01 00:10:00 499.696207 270.464475 2693.907436 1.265819 47.371939 7.289872 79.359045 1016.557471 64.382801 1 21.7 0 Night
2 2 1 2023-01-01 00:20:00 480.154652 258.314496 3139.764855 1.311214 40.736703 7.144859 88.969967 902.039503 58.907223 1 22.3 0 Night
3 3 1 2023-01-01 00:30:00 531.114934 281.212266 3043.335177 1.128121 56.214109 6.547841 73.004457 408.693047 50.655640 0 20.9 0 Night
4 4 1 2023-01-01 00:40:00 499.197208 283.305874 3134.974852 1.567308 47.607585 8.252557 97.416898 1217.009675 48.393398 1 25.0 0 Night

Feature Selection and Scaling:¶

In [20]:
# Feature selection: the classification target is Maintenance_Needed; drop the
# row index, engine identifier, raw timestamp and the regression target from
# the feature matrix.
labels = data[['Maintenance_Needed']]
drop_cols = ['Maintenance_Needed', 'Unnamed: 0', 'Engine_ID', 'Timestamp', 'Remaining_used_life']
features = data.drop(columns=drop_cols)
In [21]:
features.head()
Out[21]:
Temperature Pressure Rotational_Speed Engine_Health Fuel_Consumption Vibration_Level Oil_Temperature Altitude Humidity
0 529.106110 284.745866 3068.290695 1.764369 49.086087 9.164463 61.971112 1084.991320 55.206168
1 499.696207 270.464475 2693.907436 1.265819 47.371939 7.289872 79.359045 1016.557471 64.382801
2 480.154652 258.314496 3139.764855 1.311214 40.736703 7.144859 88.969967 902.039503 58.907223
3 531.114934 281.212266 3043.335177 1.128121 56.214109 6.547841 73.004457 408.693047 50.655640
4 499.197208 283.305874 3134.974852 1.567308 47.607585 8.252557 97.416898 1217.009675 48.393398
In [22]:
print(labels.shape)
print(labels.squeeze().shape)
(720, 1)
(720,)
In [23]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels.squeeze(), test_size=0.2, random_state=0
)

# Standardise features. The scaler is fitted on the training split only and
# then applied to both splits, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.select_dtypes(include=['number']))
X_test_scaled = scaler.transform(X_test.select_dtypes(include=['number']))
In [ ]:
 

Machine Learning Modelling¶

In [24]:
X_train_scaled.shape
Out[24]:
(576, 9)
In [25]:
X_test_scaled.shape
Out[25]:
(144, 9)
In [26]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
In [27]:
# Initialize the model

log_reg_model = LogisticRegression(random_state=0)
In [28]:
# Train the model
log_reg_model.fit(X_train_scaled, y_train)
Out[28]:
LogisticRegression(random_state=0)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(random_state=0)
In [29]:
# Make predictions on the test set
y_pred = log_reg_model.predict(X_test_scaled)
In [30]:
y_pred
Out[30]:
array([1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0,
       1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0], dtype=int64)
In [31]:
# Evaluate the model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix 

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
In [32]:
# Display results
print("Accuracy: ", accuracy*100)
print("Classification Report: \n", report)
print("Confusion Matrix: \n", matrix)
Accuracy:  72.91666666666666
Classification Report: 
               precision    recall  f1-score   support

           0       0.58      0.47      0.52        45
           1       0.78      0.85      0.81        99

    accuracy                           0.73       144
   macro avg       0.68      0.66      0.67       144
weighted avg       0.72      0.73      0.72       144

Confusion Matrix: 
 [[21 24]
 [15 84]]
In [33]:
# Visualise the logistic-regression confusion matrix as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred)

ax = sns.heatmap(cm, cmap='flare', annot=True, fmt='d')
ax.set_xlabel("Predicted Class", fontsize=12)
ax.set_ylabel("True Class", fontsize=12)
ax.set_title("Confusion Matrix", fontsize=12)

plt.show()

Is this the best we can achieve ? Why not try different classification models?¶

In [34]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Candidate classifiers to compare, each seeded for reproducibility.
models = {
    "Logistic Regression": LogisticRegression(random_state=0),
    "Decision Tree": DecisionTreeClassifier(random_state=0),
    "Random Forest": RandomForestClassifier(random_state=0),
    "SVC": SVC(random_state=0),
}
In [35]:
for key, val in models.items():
    print(key, "=", val)
Logistic Regression = LogisticRegression(random_state=0)
Decision Tree = DecisionTreeClassifier(random_state=0)
Random Forest = RandomForestClassifier(random_state=0)
SVC = SVC(random_state=0)
In [36]:
# Train every candidate model, then report its test-set metrics and
# confusion matrix. `model_name` deliberately leaks out of the loop.
for model_name, model in models.items():

    # Fit on the scaled training data and predict the held-out set.
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    # Score the predictions.
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    conf_mat = confusion_matrix(y_test, y_pred)

    # Report metrics and draw the confusion matrix for this model.
    print(model_name)
    print("Accuracy: ", accuracy*100)
    print("Classification Report: \n", report)
    plt.figure(figsize=(5, 3))
    sns.heatmap(conf_mat, cmap='flare', annot=True, fmt='d')
    plt.xlabel("Predicted Class", fontsize=10)
    plt.ylabel("True Class", fontsize=10)
    plt.title("Confusion Matrix", fontsize=10)
    plt.show()
    print("\n")
Logistic Regression
Accuracy:  72.91666666666666
Classification Report: 
               precision    recall  f1-score   support

           0       0.58      0.47      0.52        45
           1       0.78      0.85      0.81        99

    accuracy                           0.73       144
   macro avg       0.68      0.66      0.67       144
weighted avg       0.72      0.73      0.72       144


Decision Tree
Accuracy:  74.30555555555556
Classification Report: 
               precision    recall  f1-score   support

           0       0.58      0.62      0.60        45
           1       0.82      0.80      0.81        99

    accuracy                           0.74       144
   macro avg       0.70      0.71      0.71       144
weighted avg       0.75      0.74      0.75       144


Random Forest
Accuracy:  82.63888888888889
Classification Report: 
               precision    recall  f1-score   support

           0       0.76      0.64      0.70        45
           1       0.85      0.91      0.88        99

    accuracy                           0.83       144
   macro avg       0.81      0.78      0.79       144
weighted avg       0.82      0.83      0.82       144


SVC
Accuracy:  74.30555555555556
Classification Report: 
               precision    recall  f1-score   support

           0       0.62      0.47      0.53        45
           1       0.78      0.87      0.82        99

    accuracy                           0.74       144
   macro avg       0.70      0.67      0.68       144
weighted avg       0.73      0.74      0.73       144


Can we do better ? Why don't we select the most important features and train with them ?¶

In [37]:
import matplotlib.pyplot as plt

# Fit a Random Forest and rank the input features by importance score.
random_forest_model = RandomForestClassifier(random_state=0)
random_forest_model.fit(X_train_scaled, y_train)

# Pair each column name with its importance and sort, most important first.
feature_importances_df = (
    pd.DataFrame({
        'Features': X_train.columns,
        'Importance': random_forest_model.feature_importances_,
    })
    .sort_values(by='Importance', ascending=False)
)

# Plot the ranked importances as a horizontal bar chart.
sns.barplot(x='Importance', y='Features', data=feature_importances_df, palette='viridis')
Out[37]:
<Axes: xlabel='Importance', ylabel='Features'>

Let's select the top 4 features¶

In [38]:
# Most important features per the barplot above.
# NOTE(review): only four features are listed despite the "top 5" heading —
# confirm whether a fifth (e.g. the next-ranked feature) was intended.
selected_features = ['Engine_Health', 'Vibration_Level', 'Rotational_Speed', 'Oil_Temperature']
In [39]:
# Re-split train/test using only the selected features; same seed and split
# ratio as before so results are comparable.
X_train_, X_test_, y_train_, y_test_ = train_test_split(
    data[selected_features], labels.squeeze(), test_size=0.2, random_state=0
)

# Fit the scaler on the training split only, then transform both splits
# (prevents test-set statistics leaking into training).
scaler = StandardScaler()
X_train_scaled_ = scaler.fit_transform(X_train_.select_dtypes(include=['number']))
X_test_scaled_ = scaler.transform(X_test_.select_dtypes(include=['number']))
In [40]:
# Train, Test, and Evaluate Model 
for model_name, model in models.items():
    
    # Training and prediction
    model.fit(X_train_scaled_, y_train_)
    y_pred = model.predict(X_test_scaled_)

    # Evaluate the model
    accuracy = accuracy_score(y_test_, y_pred)
    report = classification_report(y_test_, y_pred)
    matrix = confusion_matrix(y_test_, y_pred)

    # Display results
    print(model_name)
    print("Accuracy: ", accuracy*100)
    print("Classification Report: \n", report)
    plt.figure(figsize=(4,2))
    sns.heatmap(matrix, cmap='flare',annot=True, fmt='d')
    plt.xlabel("Predicted Class",fontsize=10) 
    plt.ylabel("True Class",fontsize=10) 
    plt.title("Confusion Matrix",fontsize=10)
    plt.show()
    print("\n")
Logistic Regression
Accuracy:  71.52777777777779
Classification Report: 
               precision    recall  f1-score   support

           0       0.56      0.40      0.47        45
           1       0.76      0.86      0.81        99

    accuracy                           0.72       144
   macro avg       0.66      0.63      0.64       144
weighted avg       0.70      0.72      0.70       144


Decision Tree
Accuracy:  74.30555555555556
Classification Report: 
               precision    recall  f1-score   support

           0       0.57      0.69      0.63        45
           1       0.84      0.77      0.80        99

    accuracy                           0.74       144
   macro avg       0.71      0.73      0.72       144
weighted avg       0.76      0.74      0.75       144


Random Forest
Accuracy:  80.55555555555556
Classification Report: 
               precision    recall  f1-score   support

           0       0.70      0.67      0.68        45
           1       0.85      0.87      0.86        99

    accuracy                           0.81       144
   macro avg       0.77      0.77      0.77       144
weighted avg       0.80      0.81      0.80       144


SVC
Accuracy:  77.77777777777779
Classification Report: 
               precision    recall  f1-score   support

           0       0.68      0.56      0.61        45
           1       0.81      0.88      0.84        99

    accuracy                           0.78       144
   macro avg       0.74      0.72      0.73       144
weighted avg       0.77      0.78      0.77       144


Hyper-parameter Tuning¶

How can I automate this process of selecting the best parameters to train my model ?¶

In [41]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter search space for the Random Forest
# (3 x 3 x 3 x 3 = 81 combinations).
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the Random Forest model (typo `radom` fixed).
random_forest_model = RandomForestClassifier(random_state=0)

# Perform Grid Search with 5-fold cross-validation.
# n_jobs=-1 runs the 81 x 5 = 405 fits in parallel on all cores; results are
# unchanged because every fit is still seeded by random_state=0.
grid_search = GridSearchCV(estimator=random_forest_model, param_grid=param_grid,
                           cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled_, y_train_)

# Best hyper-parameter combination found
best_param = grid_search.best_params_
best_param
Out[41]:
{'max_depth': None,
 'min_samples_leaf': 4,
 'min_samples_split': 10,
 'n_estimators': 150}
In [42]:
# Train the model with the best hyperparameters
best_random_forest_model = RandomForestClassifier(random_state=0, **best_param)
best_random_forest_model.fit(X_train_scaled_, y_train_)

# Make predictions on the test set
y_pred = best_random_forest_model.predict(X_test_scaled_)

# Evaluate the model
accuracy = accuracy_score(y_test_, y_pred)
report = classification_report(y_test_, y_pred)
matrix = confusion_matrix(y_test_, y_pred)

# Display results.
# BUG FIX: the original printed `model_name`, which still held "SVC" left over
# from the model-comparison loop, mislabelling these tuned-Random-Forest results.
print("Tuned Random Forest")
print("Accuracy: ", accuracy*100)
print("Classification Report: \n", report)
plt.figure(figsize=(4,2))
sns.heatmap(matrix, cmap='flare',annot=True, fmt='d')
plt.xlabel("Predicted Class",fontsize=10) 
plt.ylabel("True Class",fontsize=10) 
plt.title("Confusion Matrix",fontsize=10)
plt.show()
print("\n")
SVC
Accuracy:  84.02777777777779
Classification Report: 
               precision    recall  f1-score   support

           0       0.79      0.67      0.72        45
           1       0.86      0.92      0.89        99

    accuracy                           0.84       144
   macro avg       0.82      0.79      0.81       144
weighted avg       0.84      0.84      0.84       144


Regression Analysis¶

In [43]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Regression task: predict Remaining_used_life from the sensor features.
lin_reg_model = LinearRegression()

labels = data[['Remaining_used_life']]
features = data.drop(['Maintenance_Needed','Unnamed: 0','Engine_ID','Timestamp', 'Remaining_used_life'], axis=1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, labels.squeeze(), test_size=0.2, random_state=0)

# Standardize the features; fit on the training split only to avoid leakage
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.select_dtypes(include=['number']))
X_test_scaled = scaler.transform(X_test.select_dtypes(include=['number']))

# Training
lin_reg_model.fit(X_train_scaled, y_train)

# Prediction on the training and testing set
y_train_pred = lin_reg_model.predict(X_train_scaled)
y_test_pred = lin_reg_model.predict(X_test_scaled)

# Evaluate the model. RMSE is computed as sqrt(MSE) because
# mean_squared_error(..., squared=False) is deprecated in scikit-learn >= 1.4
# and removed in 1.6; this form gives identical values on every version.
train_rmse = mean_squared_error(y_train, y_train_pred) ** 0.5
test_rmse = mean_squared_error(y_test, y_test_pred) ** 0.5

train_r2_score = r2_score(y_train, y_train_pred)
test_r2_score = r2_score(y_test, y_test_pred)

print("Train RMSE: ", train_rmse)
print("Test RMSE: ", test_rmse)

print("Train R^2: ", train_r2_score)
print("Test R^2: ", test_r2_score)
Train RMSE:  0.47154438446230085
Test RMSE:  0.8599072583216776
Train R^2:  0.9819079382535665
Test R^2:  0.9276613414587991